CW ADD: Adding a ‘TOT_PRCP’ column that sums SNOW and PRCP into a total
precipitation value. This column will be used in Question 3.
Local weather and local environmental and climate factors
Let’s pull in our air quality data. It contains measurements of daily
PM2.5 and air quality index values taken from various locations around
Manhattan.
PM2.5 includes particles less than or equal to 2.5 micrometers and is
also called fine particle pollution. The AQI is an index for reporting
daily air quality. It tells how clean or polluted the air is, and what
associated health effects might be a concern, especially for
ground-level ozone and particle pollution.
Let’s load the new data and have a look at its structure:
# Load the Manhattan daily air-quality file, keep only the date, PM2.5, and
# AQI columns, and shorten the column names for downstream use.
DailyAQ_00_22 <- data.frame(read.csv("data/daily-AQ-NY-00-20.csv"))[
  c('Date', 'Daily.Mean.PM2.5.Concentration', 'DAILY_AQI_VALUE')]
names(DailyAQ_00_22) <- c('DATE', 'PM2.5', 'AQI')
str(DailyAQ_00_22)
## 'data.frame': 7097 obs. of 3 variables:
## $ DATE : chr "1/1/00" "1/2/00" "1/3/00" "1/4/00" ...
## $ PM2.5: num 39.8 41.3 30.8 16.4 7.8 23.1 18.2 27 19.4 9.7 ...
## $ AQI : int 112 115 90 60 33 74 64 82 66 40 ...
xkablesummary(DailyAQ_00_22)
Table: Statistics summary.
|
|
DATE
|
PM2.5
|
AQI
|
|
Min
|
Length:7097
|
Min. :-1.2
|
Min. : 0
|
|
Q1
|
Class :character
|
1st Qu.: 5.9
|
1st Qu.: 25
|
|
Median
|
Mode :character
|
Median : 9.0
|
Median : 38
|
|
Mean
|
NA
|
Mean :10.9
|
Mean : 41
|
|
Q3
|
NA
|
3rd Qu.:13.6
|
3rd Qu.: 54
|
|
Max
|
NA
|
Max. :86.0
|
Max. :167
|
xkabledplyhead(DailyAQ_00_22)
Head
|
DATE
|
PM2.5
|
AQI
|
|
1/1/00
|
39.8
|
112
|
|
1/2/00
|
41.3
|
115
|
|
1/3/00
|
30.8
|
90
|
|
1/4/00
|
16.4
|
60
|
|
1/5/00
|
7.8
|
33
|
We need to convert the date from a character string to an R type. We
also calculate year-over-year growth rates for both daily PM2.5 and AQI
and store them in a column.
We have about 7,000 observations between the years 2000 and 2022. A
few plots to help us visualize the data:
# Distribution plots of daily PM2.5 and daily AQI, each annotated with a
# dashed vertical line at the sample mean.
mean_pm25 <- mean(DailyAQ_00_22$PM2.5)
mean_aqi <- mean(DailyAQ_00_22$AQI)
# TODO: combine plots into one frame
ggplot(DailyAQ_00_22) +
  # FIX: binwidth takes precedence when both bins and binwidth are supplied,
  # so the redundant (and conflicting) bins argument was removed.
  geom_histogram(aes(x=PM2.5), na.rm=TRUE, alpha=0.5, color="black", fill='#BD2AE2', binwidth=2) +
  geom_vline(xintercept=mean_pm25, color="black", size=1, linetype=5, show.legend=FALSE) +
  annotate("text", x=mean_pm25 + 9, y=1000, label=paste(round(mean_pm25, 2)), angle=0, size=4, color="black") +
  labs(title="Distribution of Daily PM2.5 Measurements", x="ug/m3 LC", y="Count")

ggplot(DailyAQ_00_22) +
  geom_histogram(aes(x=AQI), na.rm=TRUE, alpha=0.5, color="black", fill='#2DD164', binwidth=5) +
  geom_vline(xintercept=mean_aqi, color="black", size=1, linetype=5, show.legend=FALSE) +
  annotate("text", x=mean_aqi + 9, y=625, label=paste(round(mean_aqi, 2)), angle=0, size=4, color="black") +
  labs(title="Distribution of Daily AQI Level", x="", y="Count")

# Year-over-year change-rate line plots for PM2.5 and AQI.
# TODO: group these in same figure, separate plots
ggplot(DailyAQ_00_22_Yearly_Growth, aes(group=1)) +
  geom_line(aes(x = year, y = pm2.5_diffRate), na.rm = TRUE, stat = "identity", color="#290DDA", size=1) +
  geom_point(aes(x = year, y = pm2.5_diffRate), na.rm = TRUE, fill="#124CF2", shape = 23) +
  # FIX: the y variable is a percentage change rate, not a concentration,
  # so the axis label was corrected from "ug/m3 LC".
  labs(title="PM2.5 particulate year-over-year rate in NYC", x="Year", y="Year-over-year Diff (%)") +
  theme(
    axis.title.y = element_text(color = "#043008", size = 13),
    axis.title.y.right = element_text(color = "#E6E930", size = 13)
  )

ggplot(DailyAQ_00_22_Yearly_Growth, aes(group=1)) +
  geom_line(aes(x = year, y = aqi_diffRate), na.rm = TRUE, stat="identity", color="#043008", size=1) +
  geom_point(aes(x = year, y = aqi_diffRate), na.rm = TRUE, fill="#E6E930", shape = 23) +
  # FIX: AQI is a unitless index and this panel plots its % change rate.
  labs(title="AQI year-over-year rate in NYC", x="Year", y="Year-over-year Diff (%)") +
  theme(
    axis.title.y = element_text(color = "#043008", size = 13),
    axis.title.y.right = element_text(color = "#E6E930", size = 13)
  )

# ggplot(DailyAQ_00_22_Yearly_Growth, aes(group=1)) +
# labs(title="AQI and growth rate and growht/decay rate by year in NYC", x="Year", y="ug/m3 LC") +
# scale_y_continuous(sec.axis = sec_axis(~., name="Year-over-year Diff (%)")) +
# theme(
# axis.title.y = element_text(color = "#2DD164", size = 13),
# axis.title.y.right = element_text(color = "#E6E930", size = 13)
# )
Next, we combine our new dataset with the NYC weather data based on
the date. The days without a matching air quality measurement will be
dropped after merge.
# Inner-join (merge's default) the air-quality and weather data on DATE;
# dates without a matching air-quality measurement drop out.
DailyAQ_merged <- merge(DailyAQ_00_22, NYweath_00, by = "DATE")
# Keep only the analysis columns; merge() suffixed the duplicate year as year.x.
DailyAQ_merged <- subset(DailyAQ_merged,
                         select = c(DATE, year.x, month, PM2.5, AQI,
                                    TMAX, TMIN, PRCP, SNOW))
names(DailyAQ_merged)[2] <- "year"
str(DailyAQ_merged)
## 'data.frame': 7063 obs. of 9 variables:
## $ DATE : Date, format: "2000-01-01" "2000-01-02" ...
## $ year : num 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 ...
## $ month: Factor w/ 12 levels "01","02","03",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ PM2.5: num 39.8 41.3 30.8 16.4 7.8 23.1 18.2 27 19.4 9.7 ...
## $ AQI : int 112 115 90 60 33 74 64 82 66 40 ...
## $ TMAX : num 50 60 64 60 47 49 38 51 58 52 ...
## $ TMIN : num 34 43 51 46 29 35 30 37 44 40 ...
## $ PRCP : num 0 0 0 0.7 0 0 0 0.02 0.84 0 ...
## $ SNOW : num 0 0 0 0 0 0 0 0 0 0 ...
xkablesummary(DailyAQ_merged)
Table: Statistics summary.
|
|
DATE
|
year
|
month
|
PM2.5
|
AQI
|
TMAX
|
TMIN
|
PRCP
|
SNOW
|
|
Min
|
Min. :2000-01-01
|
Min. :2000
|
03 : 634
|
Min. :-1.2
|
Min. : 0.0
|
Min. : 13.0
|
Min. :-1.0
|
Min. :0.00
|
Min. : 0.00
|
|
Q1
|
1st Qu.:2007-02-06
|
1st Qu.:2007
|
01 : 617
|
1st Qu.: 5.9
|
1st Qu.: 25.0
|
1st Qu.: 48.0
|
1st Qu.:36.0
|
1st Qu.:0.00
|
1st Qu.: 0.00
|
|
Median
|
Median :2012-03-16
|
Median :2012
|
05 : 615
|
Median : 9.1
|
Median : 38.0
|
Median : 64.0
|
Median :48.0
|
Median :0.00
|
Median : 0.00
|
|
Mean
|
Mean :2011-12-20
|
Mean :2011
|
04 : 611
|
Mean :10.9
|
Mean : 41.1
|
Mean : 62.6
|
Mean :48.4
|
Mean :0.13
|
Mean : 0.09
|
|
Q3
|
3rd Qu.:2017-05-27
|
3rd Qu.:2017
|
12 : 610
|
3rd Qu.:13.6
|
3rd Qu.: 54.0
|
3rd Qu.: 79.0
|
3rd Qu.:63.0
|
3rd Qu.:0.05
|
3rd Qu.: 0.00
|
|
Max
|
Max. :2022-09-26
|
Max. :2022
|
07 : 579
|
Max. :86.0
|
Max. :167.0
|
Max. :103.0
|
Max. :83.0
|
Max. :7.57
|
Max. :27.30
|
|
NA
|
NA
|
NA
|
(Other):3397
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
Linear Model with daily air quality and weather variables
# Keep only the numeric variables used for correlation and regression work.
DailyAQ_numerics <- subset(DailyAQ_merged,
                           select = c(PM2.5, AQI, TMAX, TMIN, PRCP, SNOW, year))
# (disabled) combine PRCP and SNOW into a single precipitation value
#DailyAQ_numerics$PRCP <- DailyAQ_numerics$PRCP + DailyAQ_numerics$SNOW
#DailyAQ_numerics <- subset(DailyAQ_numerics, select = -c(SNOW))
# Express year as an offset from 2000 (so 0 corresponds to the year 2000).
DailyAQ_numerics$year <- DailyAQ_numerics$year - 2000
Correlation analysis
Lattice pairs plot
# Base-graphics scatterplot matrix of every numeric variable pair.
pairs(DailyAQ_numerics)

# Richer pairs plot (pairs.panels, presumably from the psych package loaded
# elsewhere): correlations, histograms, densities, and fitted lines.
pairs.panels(DailyAQ_numerics,
method = "pearson", # correlation method
hist.col = "red", # set histogram color
density = TRUE, # show density plots
ellipses = TRUE, # show correlation ellipses
smoother = TRUE,
lm = TRUE,
main = "Pairs Plot Of Weather and Air Quality Numerical Variables",
cex.labels=0.75
)
Another way to look at correlation using the corrplot
function:
# Pearson correlation matrix of the numeric variables, rendered as numbers.
DailyAQ_cor <- cor(DailyAQ_numerics)
corrplot(DailyAQ_cor, method="number", title="Correlation Plot Of Weather and Air Quality Numerical Variables", mar = c(2, 2, 2, 2))

From the pearson correlation plot above, we can see a significantly
large, positive correlation between PM2.5 concentrations and the daily
AQI values. This is expected as PM2.5 are heavily weighed in
calculations of AQI. Unfortunately, the correlation significance among
our weather and air quality variables is relatively weak. However, we
will still attempt a linear model between them below.
# Yearly average AQI and PM2.5 line plots.
ggplot(DailyAQ_00_22_Yearly_Growth) +
  geom_line(aes(x = year, y = aqi_avg), stat="identity", color="#2DD164", size=1) +
  geom_point(aes(x = year, y = aqi_avg), na.rm = TRUE, fill="#457108", shape = 21) +
  labs(title="Average AQI by year in NYC", x="Year", y="AQI value")

ggplot(DailyAQ_00_22_Yearly_Growth) +
  geom_line(aes(x = year, y = pm2.5_avg), stat="identity", color="#BD2AE2", size=1) +
  geom_point(aes(x = year, y = pm2.5_avg), na.rm = TRUE, fill="#124CF2", shape = 21) +
  # FIX: this panel plots the yearly PM2.5 average (a concentration), not a
  # growth rate, so the y label was corrected from "Year-over-year Diff (%)".
  labs(title="Average PM2.5 particulate amount by year in NYC", x="Year", y="ug/m3 LC")

Linear models
Let’s start by creating a linear model to describe the relationship
between AQI and year.
# Simple linear regression of daily AQI on year (year is re-indexed so that
# 0 corresponds to 2000).
aqi_fit <- lm(AQI ~ year, data = DailyAQ_numerics)
summary(aqi_fit)
##
## Call:
## lm(formula = AQI ~ year, data = DailyAQ_numerics)
##
## Residuals:
## Min 1Q Median 3Q Max
## -50.34 -13.11 -1.49 11.06 126.83
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 61.4369 0.4497 136.6 <2e-16 ***
## year -1.7748 0.0342 -51.9 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.4 on 7061 degrees of freedom
## Multiple R-squared: 0.276, Adjusted R-squared: 0.276
## F-statistic: 2.69e+03 on 1 and 7061 DF, p-value: <2e-16
xkabledply(aqi_fit, title = paste("First Linear Model: ", format( formula(aqi_fit) )))
First Linear Model: AQI ~ year
|
|
Estimate
|
Std. Error
|
t value
|
Pr(>|t|)
|
|
(Intercept)
|
61.44
|
0.4497
|
136.6
|
0
|
|
year
|
-1.77
|
0.0342
|
-51.8
|
0
|
The coefficient for the year regressor is significant, and is negative:
daily AQI falls by about 1.77 points per year. Although
the p-value of the F-statistic is significant, year still only explains
about 28% of the variability in daily AQI measurements.
# Daily AQI vs year: jittered points to reduce overplotting, plus an OLS
# trend line (geom_smooth with method = 'lm').
ggplot(DailyAQ_00_22, aes(x = year, y = AQI)) +
geom_point(alpha = 0.5, color = "#2DD164", position = "jitter") +
labs(x = "Year", y = "AQI Value", title = "Daily AQI Values From 2000-2022 With Trend Line") +
geom_smooth(method = 'lm', formula = 'y ~ x', color = "black", fill="black")

The plot displays a slightly downward trend in daily AQI, but there
is a lot of noise distorting the fit.
Adding month as a categorical regressor
In our first analysis, we analyzed linear trends of TMAX over time
and determined a slight positive correlation observed over the years
1900-2022. Based on that fit, we hypothesized that seasonality trends
had an impact on model performance.
We believe seasonality also affects daily AQI measurements.
# NYC weather: monthly averages of TMAX and TMIN across all years.
# FIX: TRUE is used instead of the shorthand T, which is reassignable.
NYweath_Monthly_Avg <- NYweath_00 %>%
  group_by(month) %>%
  summarize(avg_max_temp = mean(TMAX, na.rm = TRUE),
            avg_min_temp = mean(TMIN, na.rm = TRUE))
# Seasonal profile of average TMAX; month coerced to numeric for ordering.
ggplot(NYweath_Monthly_Avg, aes(x = as.numeric(month), y = avg_max_temp)) +
  geom_line(color="#F21E1E", size = 2) +
  geom_point(na.rm = TRUE, fill="#126BF4", shape = 21, size = 4) +
  labs(title="Average TMAX By Month in NYC", x="Month", y="Temperature (°F)") +
  scale_x_continuous(name = "Month",
                     breaks = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12))

# Monthly averages of PM2.5 and AQI across all years.
DailyAQ_monthly <- DailyAQ_merged %>%
  group_by(month) %>%
  summarize(pm2.5_avg = mean(PM2.5, na.rm = TRUE),
            aqi_avg = mean(AQI, na.rm = TRUE))
# Month-over-month growth/decay rates; lag() leaves January's rate NA.
DailyAQ_monthly <- DailyAQ_monthly %>%
  mutate(pm2.5_diffRate = ((pm2.5_avg - lag(pm2.5_avg)) / pm2.5_avg) * 100,
         aqi_diffRate = ((aqi_avg - lag(aqi_avg)) / aqi_avg) * 100
  )
# Populate January's rate from December (wrap-around). FIX: direct named
# column assignment replaces the fragile positional form
# `DailyAQ_monthly[1, 4]$pm2.5_diffRate <- ...`, which silently depends on
# column order.
DailyAQ_monthly$pm2.5_diffRate[1] <- ((DailyAQ_monthly$pm2.5_avg[1] - DailyAQ_monthly$pm2.5_avg[12]) / DailyAQ_monthly$pm2.5_avg[1]) * 100
DailyAQ_monthly$aqi_diffRate[1] <- ((DailyAQ_monthly$aqi_avg[1] - DailyAQ_monthly$aqi_avg[12]) / DailyAQ_monthly$aqi_avg[1]) * 100
# Average AQI by month of the year (seasonal profile).
# TODO: combine with month-over-month change plot
ggplot(DailyAQ_monthly, aes(x = as.numeric(month), y = aqi_avg)) +
geom_line(color="#47ABE9", size = 2) +
geom_point(na.rm = TRUE, fill="#C10808", shape = 21, size = 4) +
labs(title="Average AQI By Month in NYC", x="Month", y="AQI") +
scale_x_continuous(name = "Month",
breaks = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12))

# Month-over-month AQI change rate across the seasonal cycle.
ggplot(DailyAQ_monthly, aes(x = as.numeric(month), y = aqi_diffRate)) +
  geom_line(na.rm = TRUE, stat="identity", color="#043008", size=2) +
  geom_point(na.rm = TRUE, fill="#E6E930", shape = 21) +
  # FIX: y is a month-over-month percentage change, not an AQI level, so
  # the axis label was corrected from "AQI".
  labs(title="Average AQI month-over-month change rate", x="Month", y="Diff (%)") +
  scale_x_continuous(name = "Month",
                     breaks = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12))

Let’s modify our last model to attempt to fit seasonality by adding
month as a categorical regressor and our
variable-of-interest from the last project - TMAX - to
predict daily AQI.
# Model 2: daily AQI on TMAX plus month as a categorical regressor
# (January, month01, is the reference level).
aqi_fit2 <- lm(AQI ~ TMAX + month, data = DailyAQ_merged)
summary(aqi_fit2)
##
## Call:
## lm(formula = AQI ~ TMAX + month, data = DailyAQ_merged)
##
## Residuals:
## Min 1Q Median 3Q Max
## -43.72 -14.66 -3.03 11.52 124.66
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.9722 1.3444 18.58 < 2e-16 ***
## TMAX 0.6781 0.0271 25.03 < 2e-16 ***
## month02 -5.9768 1.1604 -5.15 2.7e-07 ***
## month03 -19.3975 1.1672 -16.62 < 2e-16 ***
## month04 -33.0288 1.2871 -25.66 < 2e-16 ***
## month05 -37.9619 1.4304 -26.54 < 2e-16 ***
## month06 -38.8086 1.5925 -24.37 < 2e-16 ***
## month07 -37.7940 1.6857 -22.42 < 2e-16 ***
## month08 -40.4565 1.6563 -24.43 < 2e-16 ***
## month09 -43.7962 1.5422 -28.40 < 2e-16 ***
## month10 -34.8408 1.3458 -25.89 < 2e-16 ***
## month11 -19.9973 1.2209 -16.38 < 2e-16 ***
## month12 -7.9325 1.1470 -6.92 5.1e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20 on 7050 degrees of freedom
## Multiple R-squared: 0.149, Adjusted R-squared: 0.147
## F-statistic: 103 on 12 and 7050 DF, p-value: <2e-16
xkabledply(aqi_fit2, title = paste("Second Linear Model: ", format( formula(aqi_fit2) )))
Second Linear Model: AQI ~ TMAX + month
|
|
Estimate
|
Std. Error
|
t value
|
Pr(>|t|)
|
|
(Intercept)
|
24.972
|
1.3444
|
18.58
|
0
|
|
TMAX
|
0.678
|
0.0271
|
25.03
|
0
|
|
month02
|
-5.977
|
1.1604
|
-5.15
|
0
|
|
month03
|
-19.398
|
1.1672
|
-16.62
|
0
|
|
month04
|
-33.029
|
1.2871
|
-25.66
|
0
|
|
month05
|
-37.962
|
1.4304
|
-26.54
|
0
|
|
month06
|
-38.809
|
1.5925
|
-24.37
|
0
|
|
month07
|
-37.794
|
1.6857
|
-22.42
|
0
|
|
month08
|
-40.456
|
1.6563
|
-24.43
|
0
|
|
month09
|
-43.796
|
1.5422
|
-28.40
|
0
|
|
month10
|
-34.841
|
1.3458
|
-25.89
|
0
|
|
month11
|
-19.997
|
1.2209
|
-16.38
|
0
|
|
month12
|
-7.933
|
1.1470
|
-6.92
|
0
|
The regression coefficient for TMAX is significant and positively
correlated, with each degree increase resulting in AQI increasing by a
factor of 0.68. All months, when compared to January, have a negative
impact on AQI, with September having the largest difference. The p-value
of the model’s F-statistic is also significant, allowing us to reject
the null hypothesis and conclude that there’s a significant relationship
between our chosen predictors and the daily AQI value. However, the
\(R^2\) for our model is only
.149, which indicates that only 14.9% of the variation in
daily AQI can be explained by TMAX and month.
Seasonality can cause a poor linear model. Properly testing it would
require developing a seasonality time-series model to properly fit the
data.
Check for multicollinearity
# Variance inflation factors for model 2's regressors (multicollinearity
# check); the rendered values here stay below the usual concern thresholds.
xkablevif(aqi_fit2, title = "Model 2 VIF")
Model 2 VIF
|
month02
|
month03
|
month04
|
month05
|
month06
|
month07
|
month08
|
month09
|
month10
|
month11
|
month12
|
TMAX
|
|
1.78
|
1.98
|
2.32
|
2.88
|
3.36
|
3.79
|
3.59
|
3.01
|
2.38
|
1.96
|
1.84
|
4.25
|
The VIF values of all regressors are acceptable.
k-NN model to predict month based on weather and air quality
data
A k-NN model can help us further analyze the seasonality effect by
attempting to predict the month based on AQI and
TMAX variables.
Evaluate relationships via scatter plots - scale and center
data
- scatter plots of AQI,TMAX
- check composition of labels (months)
Plot variables
# TMAX vs AQI scatter colored by month, to eyeball seasonal clustering
# before fitting the kNN classifier.
ggplot(DailyAQ_merged) +
geom_point(aes(x=AQI, y=TMAX, color=month), alpha = 0.7) +
labs(title = "Daily Maximum Temperature vs Daily Air Quality Index Value Distinguished By Month",
x = "Daily AQI Value",
y = "Daily Maximum Temperature (F)") +
scale_color_brewer(palette = "Paired")

Center and scale our data
# Z-score standardize the six numeric measurement columns for kNN (a
# distance-based model sensitive to variable scale). FIX: named column
# selection replaces the fragile positional DailyAQ_merged[4:9].
DailyAQ_scaled <- as.data.frame(scale(DailyAQ_merged[c('PM2.5', 'AQI', 'TMAX', 'TMIN', 'PRCP', 'SNOW')], center = TRUE, scale = TRUE))
str(DailyAQ_scaled)
## 'data.frame': 7063 obs. of 6 variables:
## $ PM2.5: num 3.839 4.039 2.642 0.727 -0.417 ...
## $ AQI : num 3.282 3.421 2.264 0.876 -0.373 ...
## $ TMAX : num -0.6996 -0.1462 0.0752 -0.1462 -0.8656 ...
## $ TMIN : num -0.872 -0.328 0.155 -0.147 -1.173 ...
## $ PRCP : num -0.356 -0.356 -0.356 1.502 -0.356 ...
## $ SNOW : num -0.113 -0.113 -0.113 -0.113 -0.113 ...
Create train and test data sets with 4:1 splits, as well as label
sets.
# Reproducible ~80/20 row split: draw 1 (train) or 2 (test) for each row.
set.seed(1000)
DailyAQ_sample <- sample(2, nrow(DailyAQ_scaled), replace=TRUE, prob=c(0.80, 0.20))
DailyAQ_training <- DailyAQ_scaled[DailyAQ_sample == 1, ]
DailyAQ_test <- DailyAQ_scaled[DailyAQ_sample == 2, ]
# Labels are the month factor from the unscaled data. FIX: name-based
# column access replaces the fragile positional index [ , 3].
DailyAQ_trainLabels <- DailyAQ_merged$month[DailyAQ_sample == 1]
DailyAQ_testLabels <- DailyAQ_merged$month[DailyAQ_sample == 2]
nrow(DailyAQ_training)
## [1] 5664
nrow(DailyAQ_test)
## [1] 1399
Build kNN model
# Number of neighbors for the initial kNN fit.
kval <- 3
# Fit kNN: predict each test row's month from its k nearest (scaled)
# training rows.
DailyAQ_pred <- knn(train = DailyAQ_training,
test = DailyAQ_test,
cl = DailyAQ_trainLabels,
k = kval)
# Confusion matrix of predicted vs actual month, plus overall accuracy.
DailyAQ_confusionMatrix <- caret::confusionMatrix(DailyAQ_pred, reference = DailyAQ_testLabels)
DailyAQ_pred_accuracy <- DailyAQ_confusionMatrix$overall['Accuracy']
xkabledply(as.matrix(DailyAQ_confusionMatrix), title = paste("ConfusionMatrix for k = ", kval))
ConfusionMatrix for k = 3
|
|
01
|
02
|
03
|
04
|
05
|
06
|
07
|
08
|
09
|
10
|
11
|
12
|
|
01
|
60
|
38
|
16
|
7
|
1
|
0
|
0
|
0
|
0
|
5
|
13
|
50
|
|
02
|
24
|
33
|
38
|
10
|
0
|
0
|
0
|
0
|
0
|
4
|
18
|
18
|
|
03
|
14
|
13
|
37
|
26
|
7
|
1
|
0
|
0
|
2
|
11
|
27
|
11
|
|
04
|
2
|
3
|
20
|
52
|
22
|
6
|
2
|
2
|
3
|
36
|
25
|
6
|
|
05
|
0
|
1
|
2
|
20
|
38
|
19
|
10
|
8
|
28
|
27
|
7
|
0
|
|
06
|
0
|
0
|
0
|
1
|
8
|
39
|
34
|
25
|
25
|
7
|
1
|
0
|
|
07
|
0
|
0
|
0
|
0
|
4
|
15
|
41
|
34
|
13
|
3
|
0
|
0
|
|
08
|
0
|
0
|
0
|
2
|
2
|
15
|
23
|
24
|
13
|
3
|
0
|
0
|
|
09
|
0
|
0
|
1
|
1
|
9
|
14
|
9
|
10
|
17
|
4
|
1
|
0
|
|
10
|
0
|
3
|
2
|
6
|
8
|
5
|
1
|
0
|
3
|
24
|
8
|
1
|
|
11
|
2
|
1
|
9
|
6
|
4
|
1
|
0
|
0
|
0
|
6
|
15
|
6
|
|
12
|
16
|
13
|
9
|
3
|
0
|
0
|
0
|
0
|
0
|
2
|
5
|
19
|
xkabledply(data.frame(DailyAQ_confusionMatrix$byClass), title=paste("k = ", kval))
k = 3
|
|
Sensitivity
|
Specificity
|
Pos.Pred.Value
|
Neg.Pred.Value
|
Precision
|
Recall
|
F1
|
Prevalence
|
Detection.Rate
|
Detection.Prevalence
|
Balanced.Accuracy
|
|
Class: 01
|
0.508
|
0.898
|
0.316
|
0.952
|
0.316
|
0.508
|
0.390
|
0.0843
|
0.0429
|
0.1358
|
0.704
|
|
Class: 02
|
0.314
|
0.913
|
0.228
|
0.943
|
0.228
|
0.314
|
0.264
|
0.0751
|
0.0236
|
0.1036
|
0.614
|
|
Class: 03
|
0.276
|
0.911
|
0.248
|
0.922
|
0.248
|
0.276
|
0.262
|
0.0958
|
0.0264
|
0.1065
|
0.594
|
|
Class: 04
|
0.388
|
0.900
|
0.290
|
0.933
|
0.290
|
0.388
|
0.332
|
0.0958
|
0.0372
|
0.1279
|
0.644
|
|
Class: 05
|
0.369
|
0.906
|
0.237
|
0.948
|
0.237
|
0.369
|
0.289
|
0.0736
|
0.0272
|
0.1144
|
0.637
|
|
Class: 06
|
0.339
|
0.921
|
0.279
|
0.940
|
0.279
|
0.339
|
0.306
|
0.0822
|
0.0279
|
0.1001
|
0.630
|
|
Class: 07
|
0.342
|
0.946
|
0.373
|
0.939
|
0.373
|
0.342
|
0.356
|
0.0858
|
0.0293
|
0.0786
|
0.644
|
|
Class: 08
|
0.233
|
0.955
|
0.293
|
0.940
|
0.293
|
0.233
|
0.260
|
0.0736
|
0.0172
|
0.0586
|
0.594
|
|
Class: 09
|
0.164
|
0.962
|
0.258
|
0.935
|
0.258
|
0.164
|
0.200
|
0.0743
|
0.0122
|
0.0472
|
0.563
|
|
Class: 10
|
0.182
|
0.971
|
0.393
|
0.919
|
0.393
|
0.182
|
0.249
|
0.0944
|
0.0172
|
0.0436
|
0.576
|
|
Class: 11
|
0.125
|
0.973
|
0.300
|
0.922
|
0.300
|
0.125
|
0.176
|
0.0858
|
0.0107
|
0.0357
|
0.549
|
|
Class: 12
|
0.171
|
0.963
|
0.284
|
0.931
|
0.284
|
0.171
|
0.213
|
0.0793
|
0.0136
|
0.0479
|
0.567
|
How does k affect classification accuracy?
# Fit a kNN classifier with `k` neighbors and report validation accuracy.
# Returns a 1x2 matrix with columns `k` and `accuracy` — the sapply() caller
# below relies on this shape.
evaluateK <- function(k, train_set, val_set, train_class, val_class){
  # Fixed seed keeps kNN tie-breaking reproducible across calls.
  set.seed(1000)
  preds <- knn(train = train_set,   #<- training set cases
               test = val_set,      #<- test set cases
               cl = train_class,    #<- category for classification
               k = k)               #<- number of neighbors considered
  confusion <- table(preds, val_class)
  # Diagonal entries of the confusion table are the correct classifications.
  accu <- sum(diag(confusion)) / sum(confusion)
  cbind(k = k, accuracy = accu)
}
# Call evaluateK for each odd k-value between 1 and 21.
knn_different_k = sapply(seq(1, 21, by = 2),
function(x) evaluateK(x,
train_set = DailyAQ_training,
val_set = DailyAQ_test,
train_class = DailyAQ_trainLabels,
val_class = DailyAQ_testLabels))
# sapply returns a 2 x n matrix (row 1 = k, row 2 = accuracy); reshape it
# into a data frame for plotting.
knn_different_k = data.frame(k = knn_different_k[1,],
accuracy = knn_different_k[2,])
# Elbow-style plot: validation accuracy as a function of k.
ggplot(knn_different_k, aes(x = k, y = accuracy)) +
geom_line(color = "orange", size = 1.5) +
geom_point(size = 3) +
labs(title = "kNN Model Accuracy vs k-value",
x = "Model k-value",
y = "Accuracy")

It seems 13-nearest neighbors is a decent choice because that’s the
greatest improvement in predictive accuracy before the incremental
improvement trails off. With an accuracy of 0.32, our model predicting
month based on TMAX and AQI is not a strong
fit.
Local weather and local human social and economic activity
Chris will look at the local weather data and how it affects human
behavior. Will look for correlations between precipitation, temperature,
stock market, COVID tests, and crime
First step is to transform precipitation into a yes/no factor variable. Will
use TOT_PRCP (the column name in the data) to account for all precipitation.
# Add a binary rain indicator: 1 if any total precipitation fell that day,
# 0 otherwise. Don't care how much it rains, only if it rains.
NYweath_prcpFact <- NYweath_final
# FIX: cut() already returns a factor, so the original redundant
# as.factor() re-cast was removed.
NYweath_prcpFact$PRCP_factor <- cut(NYweath_final$TOT_PRCP, c(-Inf, 0, Inf), labels = c(0, 1))
Crime data
Initial import of the data. Due to the size of the data, I imported
it once, aggregated the data into a new data frame that only includes
date and arrest count. This was exported and saved in the Git. The code
below imports directly from that aggregated dataset.
# One-time heavy import + aggregation, kept for provenance; the script runs
# from the pre-aggregated CSV checked into the repository instead.
#NYcrime <- data.frame(read.csv("/Users/christopherwasher/Documents/DATS6101/NYPD_Arrests_Data__Historic_.csv"))
#NYcrime_agg <- NYcrime %>% count(ARREST_DATE)
NYcrime_count <- tibble(read.csv("./data/NYPD_Arrests_counts.csv"))
NYcrime_count$ARREST_DATE <- as.Date(NYcrime_count$ARREST_DATE, format = "%Y-%m-%d")
#NYcrime_count$day <- format(NYcrime_count$ARREST_DATE, format="%d")
#NYcrime_count$month <- format(NYcrime_count$ARREST_DATE, format="%m")
#NYcrime_count$year <- format(NYcrime_count$ARREST_DATE, format="%Y")
# FIX: the original renamed column 2 to "ARREST_DATE", but the as.Date()
# call above already accesses it by that name, so the rename was a no-op
# and has been removed.
head(NYcrime_count)
## # A tibble: 6 × 6
## X ARREST_DATE NUM_ARREST day month year
## <int> <date> <int> <int> <int> <int>
## 1 1 2006-01-01 551 1 1 2006
## 2 2 2007-01-01 589 1 1 2007
## 3 3 2008-01-01 769 1 1 2008
## 4 4 2009-01-01 764 1 1 2009
## 5 5 2010-01-01 958 1 1 2010
## 6 6 2011-01-01 900 1 1 2011
Now will do summary statistics and basic EDA on the Crime Count
data
# Quick EDA on arrest counts: scatter over time and a boxplot of the
# daily-count distribution.
# NOTE(review): base plot() returns NULL invisibly, so crime_plot holds
# NULL; boxplot() returns its stats list. Neither variable appears to be
# reused — confirm before removing the assignments.
crime_plot <- plot(NYcrime_count$ARREST_DATE, NYcrime_count$NUM_ARREST)

crime_boxplot <- boxplot(NYcrime_count$NUM_ARREST)
Add the Crime data to the NY Weather Data, subsetting the weather data
to the years 2006 through 2021 (i.e., before 1/1/2022)
# Restrict weather to the arrest-data era (2006-2021) and sort arrests
# chronologically so rows will line up for the positional cbind below.
crimeWeather <- subset(NYweath_prcpFact, year >= 2006 & year < 2022)
NYcrime_count <- NYcrime_count[order(NYcrime_count$ARREST_DATE),]
tail(crimeWeather)
## DATE day month year TMAX TMIN TAVG PRCP SNOW TOT_PRCP PRCP_factor
## 55819 2021-12-26 26 12 2021 51 39 NA 0.00 0 0.00 0
## 55820 2021-12-27 27 12 2021 39 34 NA 0.09 0 0.09 1
## 55821 2021-12-28 28 12 2021 47 36 NA 0.05 0 0.05 1
## 55822 2021-12-29 29 12 2021 44 41 NA 0.14 0 0.14 1
## 55823 2021-12-30 30 12 2021 49 43 NA 0.05 0 0.05 1
## 55824 2021-12-31 31 12 2021 55 48 NA 0.00 0 0.00 0
# Attach arrest counts to the weather rows by position.
# NOTE(review): cbind assumes both tables cover exactly the same dates in
# the same order — a merge() on date would be safer; verify row alignment.
NYweath_crime <- cbind(crimeWeather, NYcrime_count$NUM_ARREST)
# Name the appended 12th column.
colnames(NYweath_crime)[12] <- c("NUM_ARREST")
#NYweath_crime_plot <- plot(sqrt(NYweath_crime$PRCP), NYweath_crime$NUM_ARREST)
#boxplot((NYweath_crime$TOT_PRCP))
# Daily arrests vs maximum temperature, colored by the rain indicator.
NY_weathcrime_ggplot <- ggplot(NYweath_crime,
aes(x = TMAX, y =NUM_ARREST)) +
geom_point(aes(colour = PRCP_factor), alpha = 0.5) +
labs(x = "Temperature (ºF)", y = "Number of Daily Arrests",
title = "Weather Patterns for NYC Crime")
NY_weathcrime_ggplot

# Same scatter on a 25% random subsample (for readability), with the rain
# indicator mapped to shape and month to color.
# NOTE(review): sample_frac() is not preceded by set.seed(), so this plot
# is not reproducible run-to-run.
NY_weathcrime_ggplot2 <- NYweath_crime %>%
sample_frac(0.25) %>%
ggplot(aes(x = TMAX, y =NUM_ARREST)) +
geom_point(aes(shape = PRCP_factor, colour = month)) +
labs(x = "Temperature (ºF)", y = "Number of Daily Arrests",
title = "Weather Patterns for NYC Crime")
NY_weathcrime_ggplot2
Initially made a boxplot of precipitation to observe the distribution.
It is extremely skewed. However, because I’m only interested in
determining if precipitation has an effect, will build a linear model
using PRCP as a Factor.
# Linear models of daily arrest counts:
#  - crimeWeath_lm: TMAX + rain indicator + year
#  - crimeWeathMonth_lm: same regressors plus month as a categorical term
crimeWeath_lm <- lm(NUM_ARREST ~ TMAX + PRCP_factor + year,
data = NYweath_crime)
crimeWeathMonth_lm <- lm(NUM_ARREST ~ (TMAX + PRCP_factor + year + month),
data = NYweath_crime)
#crimeWeathTMIN_lm <- lm(NUM_ARREST ~ (TMIN + PRCP_factor),
# data = NYweath_crime)
summary(crimeWeathMonth_lm)
##
## Call:
## lm(formula = NUM_ARREST ~ (TMAX + PRCP_factor + year + month),
## data = NYweath_crime)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1029.2 -190.2 0.3 202.7 675.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 97442.248 1530.816 63.65 < 2e-16 ***
## TMAX 2.338 0.404 5.79 7.6e-09 ***
## PRCP_factor1 -46.130 7.368 -6.26 4.1e-10 ***
## year -47.963 0.760 -63.07 < 2e-16 ***
## month02 15.163 17.434 0.87 0.3845
## month03 3.697 17.531 0.21 0.8330
## month04 -52.883 19.351 -2.73 0.0063 **
## month05 -69.008 21.337 -3.23 0.0012 **
## month06 -125.430 23.518 -5.33 1.0e-07 ***
## month07 -160.821 25.009 -6.43 1.4e-10 ***
## month08 -131.473 24.384 -5.39 7.3e-08 ***
## month09 -144.147 22.672 -6.36 2.2e-10 ***
## month10 -82.766 19.830 -4.17 3.0e-05 ***
## month11 -135.518 18.046 -7.51 6.8e-14 ***
## month12 -205.302 17.138 -11.98 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 268 on 5829 degrees of freedom
## Multiple R-squared: 0.427, Adjusted R-squared: 0.426
## F-statistic: 310 on 14 and 5829 DF, p-value: <2e-16
The linear model of arrest numbers as a result of temperature and
precipitation. The coefficients are significant and the R^2 is 0.427. This
indicates there is a statistically significant relationship between
arrests and TMAX and precipitation, and these variables (together with
year and month) explain roughly 43% of the variability in the data.
Increasing TMAX correlated with an increase in arrests, and the presence
of precipitation is associated with a decreased number of arrests.
Stock Market Data
Import the stock market data and convert the date column to a ‘Date’
data type. Also pulled out the ‘day’, ‘month’, and ‘year’ columns to
help in analysis.
One last note, will need to fill in the missing date and populated
the other columns with ‘NAs’. This will enable us to combine the stocks
data with the weather data.
# Dow Jones Industrial Average daily history. Values arrive as strings with
# thousands separators, "%" signs, and a magnitude suffix on volume (e.g.
# "418.43M") — cleaned in a later chunk.
NYstock <- tibble(read.csv("./data/Dow Jones Industrial Average Historical Data.csv"))
tail(NYstock)
## # A tibble: 6 × 7
## Date Price Open High Low Vol. Change..
## <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 11/11/22 33,749.18 33,797.75 33,815.82 33,394.49 418.43M 0.11%
## 2 11/14/22 33,537.16 33,645.97 33,963.62 33,534.27 339.05M -0.63%
## 3 11/15/22 33,594.17 33,804.97 33,984.90 33,321.26 380.64M 0.17%
## 4 11/16/22 33,556.60 33,554.93 33,682.39 33,521.23 288.59M -0.11%
## 5 11/17/22 33,547.37 33,277.00 33,615.82 33,245.78 301.53M -0.03%
## 6 11/18/22 33,747.14 33,606.59 33,827.83 33,541.07 296.37M 0.60%
# Parse trading dates, then insert rows for the missing calendar days
# (weekends/holidays) so the series can align with daily weather data;
# the filled-in rows carry NA in every other column.
NYstock$Date <- as.Date(NYstock$Date, format = "%m/%d/%y")
NYstock2 <- NYstock
NYstock2 <- NYstock2 %>%
  complete(Date = seq.Date(min(Date), max(Date), by = "day"))
# Show more digits while inspecting the stock values.
# FIX: TRUE instead of the reassignable shorthand T.
# NOTE(review): base R has no "scientific" option (the standard knob is
# `scipen`), so only `digits` has an effect here — confirm intent.
options(scientific = TRUE, digits = 10)
# This is all just test code for figuring out how to clean the data.
# Not part of final script.
#NYstocktest <- NYstock2
#NYstocktest$Vol. = substr(NYstocktest$Vol.,1,nchar(NYstocktest$Vol.)-1)
#tail(NYstocktest)
#NYstocktest$Price <- gsub(",", "", NYstocktest$Price)
#NYstocktest[3:5] <- lapply(NYstocktest[3:5], gsub, pattern = ",", replacement = "")
#NYstocktest$Change.. <- gsub("%", "", NYstocktest$Change..)
#NYstocktest[2:7] <- sapply(NYstocktest[2:7], as.numeric)
###
# Clean the raw character columns and convert them to numeric.
# NOTE(review): the original unconditionally dropped the last character of
# Vol., which assumes every value ends in "M" — a value with a "K"/"B"
# suffix would be silently mis-scaled, and a suffix-less value would lose
# its last digit. Scale explicitly so every volume is in millions.
vol_scale <- ifelse(grepl("B$", NYstock2$Vol.), 1000,
             ifelse(grepl("K$", NYstock2$Vol.), 0.001, 1))
NYstock2$Vol. <- as.character(as.numeric(gsub("[,MKB]", "", NYstock2$Vol.)) * vol_scale)
NYstock2[2:5] <- lapply(NYstock2[2:5], gsub, pattern = ",", replacement = "")
NYstock2$Change.. <- gsub("%", "", NYstock2$Change..)
NYstock2[2:7] <- sapply(NYstock2[2:7], as.numeric)
# Character date parts for grouping/plotting later on.
NYstock2$day <- format(NYstock2$Date, format = "%d")
NYstock2$month <- format(NYstock2$Date, format = "%m")
NYstock2$year <- format(NYstock2$Date, format = "%Y")
head(NYstock2)
## # A tibble: 6 × 10
## Date Price Open High Low Vol. Change.. day month year
## <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr>
## 1 1979-12-25 839. 839. 842. 833. NA 0 25 12 1979
## 2 1979-12-26 838. 839. 843. 834. NA -0.12 26 12 1979
## 3 1979-12-27 840. 838. 843. 834. NA 0.23 27 12 1979
## 4 1979-12-28 839. 840. 843. 835. NA -0.14 28 12 1979
## 5 1979-12-29 NA NA NA NA NA NA 29 12 1979
## 6 1979-12-30 NA NA NA NA NA NA 30 12 1979
summary(NYstock2)
## Date Price Open
## Min. :1979-12-25 Min. : 759.13 Min. : 759.980
## 1st Qu.:1990-09-15 1st Qu.: 2733.83 1st Qu.: 2733.387
## Median :2001-06-06 Median : 9447.96 Median : 9438.680
## Mean :2001-06-06 Mean :10214.49 Mean :10212.138
## 3rd Qu.:2012-02-26 3rd Qu.:13171.78 3rd Qu.:13170.677
## Max. :2022-11-18 Max. :36799.65 Max. :36722.600
## NA's :4848 NA's :4848
## High Low Vol.
## Min. : 767.410 Min. : 729.950 Min. : 1.5900
## 1st Qu.: 2747.680 1st Qu.: 2715.142 1st Qu.: 40.3575
## Median : 9521.945 Median : 9348.660 Median :152.8850
## Mean :10274.204 Mean :10147.937 Mean :168.8159
## 3rd Qu.:13242.507 3rd Qu.:13098.073 3rd Qu.:257.3475
## Max. :36952.650 Max. :36636.000 Max. :922.6800
## NA's :4848 NA's :4848 NA's :6878
## Change.. day month year
## Min. :-22.610000 Length:15670 Length:15670 Length:15670
## 1st Qu.: -0.460000 Class :character Class :character Class :character
## Median : 0.050000 Mode :character Mode :character Mode :character
## Mean : 0.040395
## 3rd Qu.: 0.570000
## Max. : 11.370000
## NA's :4848
options(scientific=T, digits = 3)
We really only care about the volume data, so we will remove all other
columns and work only with Date + Vol. We will combine that with the
weather data for further analysis.
# Keep only the date and volume columns from the stock data, trimmed to a
# common date range with the weather data before combining.
NYstock_final <- NYstock2[,c("Date", "Vol.")]
NYstock_final <- subset(NYstock_final, Date <= "2022-09-26")
weather_stockDates <- subset(NYweath_prcpFact, DATE >= "1979-12-25")
# NOTE(review): cbind() pairs rows purely by position — this is only correct
# if both frames cover exactly the same run of calendar days with no gaps.
# A merge()/join on the date column would be safer; verify row counts match.
stockWeather <- cbind(weather_stockDates, NYstock_final)
# Column 13 is the stock "Vol." column after the bind — a positional index
# that breaks silently if the weather frame gains or loses a column.
colnames(stockWeather)[13] <- c("Volume")
Now we will do EDA on the volume data to build a linear regression
model. First we will look at normality and look for any correlations.
# Drop the non-trading days (Volume is NA there) before looking at the
# distribution of daily trade volume.
stockWeather_rmNA <- stockWeather[!is.na(stockWeather$Volume), ]
stock_hist <- hist(stockWeather_rmNA$Volume)

Histogram shows the data is right skewed. Will use sqrt of the volume
to normalize.
# Square-root transform to reduce the right skew in daily volume.
stockWeather_rmNA$Volume_norm <- sqrt(stockWeather_rmNA$Volume)
# Drop the duplicate stock-side Date column (the weather DATE column remains).
stockWeather_rmNA <- subset(stockWeather_rmNA, select = -c(Date))
# NOTE(review): `year` is a character column, so `year >= 1988` compares
# lexicographically after coercion — that happens to work for 4-digit years,
# but as.numeric(year) would be safer. Also note the window includes
# 1988-1989, slightly wider than "the 90s".
stockWeather_90s <- subset(stockWeather_rmNA, year >= 1988 & year <= 1999)
hist(stockWeather_rmNA$Volume_norm)

boxplot(stockWeather_rmNA$Volume_norm)

The distribution of sqrt Volume is considerably more normal. Will now
look at correlations with Weather data. The boxplot shows there are no
outliers after normalizing the data.
# Pairwise Pearson correlations between the weather and volume variables;
# density and ellipse overlays are disabled for readability.
pairs.panels(
  stockWeather_rmNA[c("TMAX", "TOT_PRCP", "PRCP_factor", "Volume", "Volume_norm")],
  method   = "pearson",
  hist.col = "#00AFBB",
  density  = FALSE,
  ellipses = FALSE
)

There are no strong correlations present in the data. Will next look
at a scatter plot of the Stock Volume vs TMAX, categorized by days with
PRCP.
# Scatter of sqrt-normalized volume vs daily high temperature, colored by
# whether any precipitation fell that day.
NY_weathstock_scatter <- ggplot(stockWeather_rmNA,
                                aes(x = TMAX, y = Volume_norm)) +
  geom_point(aes(colour = PRCP_factor)) +
  # FIX: y plots Volume_norm (sqrt-transformed), so label it as normalized,
  # matching the 1990s version of this plot.
  labs(x = "Temperature",
       y = "Normalized Daily DOW Trade Volume",
       title = "Weather Patterns for DOW Trade Volume")
NY_weathstock_scatter

## Trying again with the 90's stock data.
NY_90s_weathstock_scatter <- ggplot(stockWeather_90s, aes(x = TMAX, y =Volume_norm)) +
geom_point(aes(colour = PRCP_factor)) +
labs(x = "Temperature (ºF)", y = "Normalized Daily DOW Trade Volume (M)",
title = "Weather Patterns for DOW Trade Volume in the 1990s")
NY_90s_weathstock_scatter

# Same scatter on a 30% random subsample so overplotting is reduced,
# additionally encoding month as colour and precipitation as shape.
# NOTE(review): sample_frac() draws a random subset; without a fixed seed
# the plot changes on every knit. Seed it for reproducibility.
set.seed(1)
NY_90s_weathstock_scatter2 <- stockWeather_90s %>%
  sample_frac(0.3) %>%
  ggplot(aes(x = TMAX, y = Volume_norm)) +
  geom_point(aes(colour = month, shape = PRCP_factor)) +
  labs(x = "Temperature (ºF)", y = "Normalized Daily DOW Trade Volume (M)",
       title = "Weather Patterns for DOW Trade Volume in the 1990s")
NY_90s_weathstock_scatter2

# OLS of normalized volume on temperature, precipitation presence, year and
# month (full sample of trading days).
stock_LM <- lm(Volume_norm ~ TMAX + PRCP_factor + year + month,
               data = stockWeather_rmNA)
summary(stock_LM)
##
## Call:
## lm(formula = Volume_norm ~ TMAX + PRCP_factor + year + month,
## data = stockWeather_rmNA)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.414 -2.203 -0.663 2.724 14.811
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.92e+02 7.64e+00 -103.76 < 2e-16 ***
## TMAX -8.70e-03 4.31e-03 -2.02 0.04389 *
## PRCP_factor1 -2.84e-02 8.04e-02 -0.35 0.72375
## year 4.02e-01 3.81e-03 105.40 < 2e-16 ***
## month02 -2.86e-01 1.92e-01 -1.49 0.13696
## month03 1.53e-01 1.91e-01 0.80 0.42478
## month04 -2.01e-01 2.12e-01 -0.95 0.34321
## month05 -3.62e-01 2.31e-01 -1.57 0.11699
## month06 -2.62e-01 2.54e-01 -1.03 0.30286
## month07 -5.68e-01 2.71e-01 -2.10 0.03600 *
## month08 -9.47e-01 2.65e-01 -3.58 0.00035 ***
## month09 -2.91e-01 2.46e-01 -1.18 0.23692
## month10 -1.11e-01 2.16e-01 -0.52 0.60609
## month11 -4.67e-01 2.01e-01 -2.33 0.02003 *
## month12 -6.34e-01 1.90e-01 -3.34 0.00084 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.57 on 8738 degrees of freedom
## Multiple R-squared: 0.562, Adjusted R-squared: 0.561
## F-statistic: 801 on 14 and 8738 DF, p-value: <2e-16
# Same specification refit on the 1988-1999 subset.
stock_LM_90s <- lm(Volume_norm ~ TMAX + PRCP_factor + year + month,
                   data = stockWeather_90s)
summary(stock_LM_90s)
##
## Call:
## lm(formula = Volume_norm ~ TMAX + PRCP_factor + year + month,
## data = stockWeather_90s)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.575 -0.746 -0.142 0.596 8.329
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.62e+02 1.19e+01 -72.61 < 2e-16 ***
## TMAX 5.16e-03 2.31e-03 2.24 0.02541 *
## PRCP_factor1 -3.52e-02 4.37e-02 -0.81 0.42087
## year 4.35e-01 5.95e-03 73.10 < 2e-16 ***
## month02 -1.51e-01 1.03e-01 -1.46 0.14481
## month03 -2.52e-01 1.02e-01 -2.48 0.01322 *
## month04 -1.40e-01 1.12e-01 -1.24 0.21419
## month05 -4.00e-01 1.22e-01 -3.29 0.00102 **
## month06 -4.46e-01 1.35e-01 -3.31 0.00096 ***
## month07 -4.75e-01 1.44e-01 -3.31 0.00095 ***
## month08 -4.18e-01 1.39e-01 -3.00 0.00270 **
## month09 -2.52e-01 1.29e-01 -1.96 0.05027 .
## month10 8.14e-02 1.13e-01 0.72 0.47338
## month11 4.88e-02 1.06e-01 0.46 0.64464
## month12 -2.97e-02 1.01e-01 -0.30 0.76783
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.13 on 3018 degrees of freedom
## Multiple R-squared: 0.641, Adjusted R-squared: 0.64
## F-statistic: 385 on 14 and 3018 DF, p-value: <2e-16
The Linear Model that incorporates all the Stock data from
1988-present had a statistically significant TMAX, year, and some of the
months.
The stock data subset to the 90s generated a similar model.
Find the confidence intervals of the coefficients.
COVID Data
Looking at the effect of precipitation and temperature on the number
of positive COVID cases. Using the “CASE_COUNT” parameter for the NYC
Covid dataset. CASE_COUNT represents the count of patients tested who
were confirmed to be COVID-19 cases on date_of_interest
# NOTE(review): `scientific` is not a recognised options() name (only
# `digits` took effect); the real option is `scipen`, default 0, so
# behavior is unchanged here.
options(scipen = 0, digits = 3)
# Daily NYC COVID counts; keep date, confirmed and probable case counts.
# as_tibble() is the explicit conversion (tibble(data.frame) is deprecated).
NYcovid <- as_tibble(read.csv("./data/COVID-19_Daily_Counts_of_Cases__Hospitalizations__and_Deaths.csv"))
NYcovid <- select(NYcovid, 1:3)
head(NYcovid)
## # A tibble: 6 × 3
## date_of_interest CASE_COUNT PROBABLE_CASE_COUNT
## <chr> <int> <int>
## 1 02/29/2020 1 0
## 2 03/01/2020 0 0
## 3 03/02/2020 0 0
## 4 03/03/2020 1 0
## 5 03/04/2020 5 0
## 6 03/05/2020 3 0
# Standardize the date column name, parse it, and derive character
# day/month/year components for grouping.
colnames(NYcovid)[1] <- "DATE"
NYcovid$DATE <- as.Date(NYcovid$DATE, format = "%m/%d/%Y")
for (part in c("day", "month", "year")) {
  fmt <- c(day = "%d", month = "%m", year = "%Y")[[part]]
  NYcovid[[part]] <- format(NYcovid$DATE, format = fmt)
}
head(NYcovid)
## # A tibble: 6 × 6
## DATE CASE_COUNT PROBABLE_CASE_COUNT day month year
## <date> <int> <int> <chr> <chr> <chr>
## 1 2020-02-29 1 0 29 02 2020
## 2 2020-03-01 0 0 01 03 2020
## 3 2020-03-02 0 0 02 03 2020
## 4 2020-03-03 1 0 03 03 2020
## 5 2020-03-04 5 0 04 03 2020
## 6 2020-03-05 3 0 05 03 2020
summary(NYcovid)
## DATE CASE_COUNT PROBABLE_CASE_COUNT day
## Min. :2020-02-29 Min. : 0 Min. : 0 Length:991
## 1st Qu.:2020-11-02 1st Qu.: 574 1st Qu.: 76 Class :character
## Median :2021-07-08 Median : 1437 Median : 343 Mode :character
## Mean :2021-07-08 Mean : 2534 Mean : 478
## 3rd Qu.:2022-03-12 3rd Qu.: 2796 3rd Qu.: 644
## Max. :2022-11-15 Max. :54947 Max. :5882
## month year
## Length:991 Length:991
## Class :character Class :character
## Mode :character Mode :character
##
##
##
Next, we looked at the normality of the COVID count data. The counts were
extremely skewed to the right. We first removed multiple rounds of
outliers using the outlierKD2 function. After removing all outliers, the
data was still skewed right, but less extremely so. We used a square-root
transform to normalize the data.
# Raw series and boxplot show extreme right skew in daily case counts.
covid_plot <- plot(NYcovid$DATE, NYcovid$CASE_COUNT)

covid_boxplot <- boxplot(NYcovid$CASE_COUNT)

# First round of outlier removal. NOTE(review): outlierKD2 appears to
# replace outliers with NA rather than dropping rows (downstream models
# report "observations deleted due to missingness") — confirm.
NYcovid_rmOuts <- outlierKD2(NYcovid, CASE_COUNT, rm=TRUE, boxplt=TRUE, histogram=TRUE, qqplt=TRUE)

## Outliers identified: 42
## Proportion (%) of outliers: 4.4
## Mean of the outliers: 21841
## Mean without removing outliers: 2534
## Mean if we remove outliers: 1680
## Outliers successfully removed
NYcovid_rmOuts2 <- outlierKD2(NYcovid_rmOuts, CASE_COUNT, rm=TRUE, boxplt=TRUE, histogram=TRUE, qqplt=TRUE)

## Outliers identified: 15
## Proportion (%) of outliers: 1.6
## Mean of the outliers: 5696
## Mean without removing outliers: 1680
## Mean if we remove outliers: 1615
## Outliers successfully removed
NYcovid_rmOuts3 <- outlierKD2(NYcovid_rmOuts2, CASE_COUNT, rm=TRUE, boxplt=TRUE, histogram=TRUE, qqplt=TRUE)

## Outliers identified: 7
## Proportion (%) of outliers: 0.8
## Mean of the outliers: 5200
## Mean without removing outliers: 1615
## Mean if we remove outliers: 1588
## Outliers successfully removed
NYcovid_rmOuts4 <- outlierKD2(NYcovid_rmOuts3, CASE_COUNT, rm=TRUE, boxplt=TRUE, histogram=TRUE, qqplt=TRUE)

## Outliers identified: 2
## Proportion (%) of outliers: 0.2
## Mean of the outliers: 5094
## Mean without removing outliers: 1588
## Mean if we remove outliers: 1581
## Outliers successfully removed
# Re-plot the series after four rounds of outlier removal.
covid_plot <- plot(NYcovid_rmOuts4$DATE, NYcovid_rmOuts4$CASE_COUNT)
# NOTE(review): this inspects rmOuts3, one outlier round behind the plot
# above — presumably rmOuts4 was intended; confirm.
tail(NYcovid_rmOuts3)
## # A tibble: 6 × 6
## DATE CASE_COUNT PROBABLE_CASE_COUNT day month year
## <date> <int> <int> <chr> <chr> <chr>
## 1 2022-11-10 1957 621 10 11 2022
## 2 2022-11-11 1580 395 11 11 2022
## 3 2022-11-12 1477 539 12 11 2022
## 4 2022-11-13 1332 514 13 11 2022
## 5 2022-11-14 2861 800 14 11 2022
## 6 2022-11-15 2120 652 15 11 2022
# Square-root transform of the outlier-cleaned case counts.
# BUG FIX: the original took sqrt() of NYcovid_rmOuts3 but bound it to
# NYcovid_rmOuts4 — one outlier round out of sync. Use the same frame for
# both so sqrt_count matches CASE_COUNT row-for-row.
sqrt_count <- sqrt(NYcovid_rmOuts4$CASE_COUNT)
#hist(sqrt_count)
NYcovid_final <- cbind(NYcovid_rmOuts4, sqrt_count)
head(NYcovid_final)
## DATE CASE_COUNT PROBABLE_CASE_COUNT day month year sqrt_count
## 1 2020-02-29 1 0 29 02 2020 1.00
## 2 2020-03-01 0 0 01 03 2020 0.00
## 3 2020-03-02 0 0 02 03 2020 0.00
## 4 2020-03-03 1 0 03 03 2020 1.00
## 5 2020-03-04 5 0 04 03 2020 2.24
## 6 2020-03-05 3 0 05 03 2020 1.73
Add the COVID data to the NY weather data, subsetting the weather data
to on or after 2/29/2020 (the first date in the COVID data).
covWeather <- subset(NYweath_prcpFact, DATE >= ("2020-02-29"))
NYcovid_finaldates <- subset(NYcovid_final, DATE <= "2022-09-26")
tail(covWeather)
## DATE day month year TMAX TMIN TAVG PRCP SNOW TOT_PRCP PRCP_factor
## 56088 2022-09-21 21 09 2022 80 62 NA 0.00 0 0.00 0
## 56089 2022-09-22 22 09 2022 75 57 NA 0.47 0 0.47 1
## 56090 2022-09-23 23 09 2022 63 51 NA 0.00 0 0.00 0
## 56091 2022-09-24 24 09 2022 69 49 NA 0.00 0 0.00 0
## 56092 2022-09-25 25 09 2022 72 59 NA 1.11 0 1.11 1
## 56093 2022-09-26 26 09 2022 74 58 NA 0.00 0 0.00 0
# NOTE(review): cbind() pairs rows purely by position; this assumes the
# weather and COVID frames span exactly the same run of dates — verify the
# row counts match (a merge on DATE would be safer).
NYweath_prcpCov <- cbind(covWeather, NYcovid_finaldates$CASE_COUNT,
NYcovid_finaldates$sqrt_count)
# Columns 12:13 are the two just-bound COVID columns (positional index —
# brittle if the weather frame changes shape).
colnames(NYweath_prcpCov)[12:13] <- c("CASE_COUNT", "sqrt_count")
# Scatter of sqrt case counts vs sqrt precipitation.
covCount_prcp_plot <- plot(NYweath_prcpCov$sqrt_count, sqrt(NYweath_prcpCov$PRCP))

NYweath_cov_final <- NYweath_prcpCov[,c(1:5, 8, 10:13)]
Plot of COVID case count vs. precipitation: there is no apparent
relationship. However, we are more interested in the effect of the
presence of precipitation than in its correlation with precipitation amount.
T-test comparing Covid positive counts on days with precipitation vs
days without prcp.
# Split days by the precipitation indicator and compare mean sqrt COVID
# counts with a Welch two-sample t-test.
cov_prcp1 <- subset(NYweath_cov_final, PRCP_factor == 1)
cov_prcp0 <- subset(NYweath_cov_final, PRCP_factor == 0)
cov_count_ttest <- t.test(cov_prcp0$sqrt_count, cov_prcp1$sqrt_count)
cov_count_ttest
##
## Welch Two Sample t-test
##
## data: cov_prcp0$sqrt_count and cov_prcp1$sqrt_count
## t = 0.2, df = 650, p-value = 0.8
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.02 2.49
## sample estimates:
## mean of x mean of y
## 36.3 36.1
# Boxplot of sqrt COVID counts grouped by the precipitation indicator.
cov_count_bplot <- ggplot(NYweath_cov_final,
                          aes(y = sqrt_count, x = PRCP_factor)) +
  geom_boxplot() +
  labs(title = "COVID Positive Counts")
cov_count_bplot

## Repeating this EDA looking only at Covid cases from 2021+.
# NOTE(review): `year` is a character column; `year >= 2021` works only
# because 4-digit years compare correctly as strings — as.numeric(year)
# would be more robust.
cov_2021 <- subset(NYweath_cov_final, year >= 2021)
cov_2021count_bplot <- ggplot()+
geom_boxplot(data = cov_2021, aes(y = sqrt_count, x = PRCP_factor)) +
labs(title = "2021 COVID Positive Counts")
cov_2021count_bplot

# Same Welch t-test on the 2021+ subset: precipitation vs no-precipitation
# days.
cov_2021prcp1 <- subset(cov_2021, PRCP_factor == 1)
cov_2021prcp0 <- subset(cov_2021, PRCP_factor == 0)
cov_2021count_ttest <- t.test(cov_2021prcp0$sqrt_count, cov_2021prcp1$sqrt_count)
cov_2021count_ttest
##
## Welch Two Sample t-test
##
## data: cov_2021prcp0$sqrt_count and cov_2021prcp1$sqrt_count
## t = 1, df = 414, p-value = 0.2
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.06 4.10
## sample estimates:
## mean of x mean of y
## 40.2 38.7
No significant difference in the mean Covid case counts on days with
precipitation or without. However, there was a greater difference in the
means when only incorporating Covid from 2021+.
# Scatter of sqrt COVID case counts vs daily high temperature, with month
# as colour and precipitation presence as shape.
covWeath_final_scatter <- ggplot(NYweath_cov_final,
                                 aes(x = TMAX,
                                     y = sqrt_count)) +
  geom_point(aes(colour = month, shape = PRCP_factor)) +
  # BUG FIX: the y-axis label was copy-pasted from the stock-volume plot
  # ("...DOW Trade Volume"); this plot shows COVID case counts.
  labs(x = "Temperature",
       y = "Square Root of Daily COVID Case Count",
       title = "Weather Patterns for Covid Case Counts")
covWeath_final_scatter

Will now build a linear model that incorporates temperature,
precipitation, and Month to predict Covid counts.
library(psych)
# Pearson correlation matrix across the weather/COVID columns, without the
# density and ellipse overlays.
pairs.panels(
  NYweath_cov_final[4:10],
  method   = "pearson",
  hist.col = "#00AFBB",
  density  = FALSE,
  ellipses = FALSE
)

# Baseline model: sqrt COVID counts on temperature, precipitation presence
# and year (no month terms).
cov_weathLM <- lm(sqrt_count ~ TMAX + PRCP_factor + year,
                  data = NYweath_cov_final)
summary(cov_weathLM)
##
## Call:
## lm(formula = sqrt_count ~ TMAX + PRCP_factor + year, data = NYweath_cov_final)
##
## Residuals:
## Min 1Q Median 3Q Max
## -38.45 -9.44 -1.96 10.18 42.67
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.51e+04 1.25e+03 -12.09 <2e-16 ***
## TMAX -3.11e-01 2.87e-02 -10.82 <2e-16 ***
## PRCP_factor1 -4.84e-01 1.01e+00 -0.48 0.63
## year 7.49e+00 6.17e-01 12.13 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.3 on 873 degrees of freedom
## (64 observations deleted due to missingness)
## Multiple R-squared: 0.23, Adjusted R-squared: 0.227
## F-statistic: 86.8 on 3 and 873 DF, p-value: <2e-16
# Adding month dummies to capture seasonality on top of the baseline model.
cov_weathLM_month <- lm(sqrt_count ~ TMAX + PRCP_factor + year + month,
                        data = NYweath_cov_final)
summary(cov_weathLM_month)
##
## Call:
## lm(formula = sqrt_count ~ TMAX + PRCP_factor + year + month,
## data = NYweath_cov_final)
##
## Residuals:
## Min 1Q Median 3Q Max
## -41.67 -8.91 -0.68 8.14 41.26
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.56e+04 1.22e+03 -12.74 < 2e-16 ***
## TMAX -5.53e-02 5.57e-02 -0.99 0.3210
## PRCP_factor1 -6.30e-01 9.33e-01 -0.68 0.4994
## year 7.74e+00 6.05e-01 12.79 < 2e-16 ***
## month02 -1.87e+01 3.03e+00 -6.16 1.1e-09 ***
## month03 -1.70e+01 2.99e+00 -5.67 1.9e-08 ***
## month04 -9.47e+00 3.15e+00 -3.01 0.0027 **
## month05 -1.91e+01 3.41e+00 -5.62 2.6e-08 ***
## month06 -2.63e+01 3.76e+00 -7.00 5.0e-12 ***
## month07 -2.15e+01 3.94e+00 -5.46 6.2e-08 ***
## month08 -2.07e+01 3.89e+00 -5.32 1.3e-07 ***
## month09 -2.32e+01 3.62e+00 -6.39 2.6e-10 ***
## month10 -2.54e+01 3.43e+00 -7.40 3.3e-13 ***
## month11 -1.66e+01 3.22e+00 -5.15 3.2e-07 ***
## month12 2.09e+00 3.31e+00 0.63 0.5289
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.2 on 862 degrees of freedom
## (64 observations deleted due to missingness)
## Multiple R-squared: 0.358, Adjusted R-squared: 0.347
## F-statistic: 34.3 on 14 and 862 DF, p-value: <2e-16
# NOTE(review): unlike the two models above, this one fits the raw
# CASE_COUNT rather than sqrt_count — confirm that is intentional, since
# it makes the coefficients incomparable with the earlier models.
cov2021_weathLM <- lm(CASE_COUNT ~ TMAX + PRCP_factor + year + month,
data = cov_2021)
summary(cov2021_weathLM)
##
## Call:
## lm(formula = CASE_COUNT ~ TMAX + PRCP_factor + year + month,
## data = cov_2021)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3423 -829 -67 548 3129
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.14e+06 1.92e+05 -5.94 5.0e-09 ***
## TMAX 2.11e-01 5.45e+00 0.04 0.9692
## PRCP_factor1 -7.93e+01 9.24e+01 -0.86 0.3908
## year 5.66e+02 9.50e+01 5.96 4.5e-09 ***
## month02 -1.73e+03 2.47e+02 -7.01 7.0e-12 ***
## month03 -1.81e+03 2.57e+02 -7.03 5.9e-12 ***
## month04 -1.82e+03 2.78e+02 -6.53 1.5e-10 ***
## month05 -1.84e+03 3.07e+02 -6.00 3.6e-09 ***
## month06 -2.28e+03 3.40e+02 -6.70 5.1e-11 ***
## month07 -1.81e+03 3.57e+02 -5.05 5.9e-07 ***
## month08 -1.89e+03 3.58e+02 -5.29 1.8e-07 ***
## month09 -2.28e+03 3.30e+02 -6.92 1.2e-11 ***
## month10 -2.61e+03 3.25e+02 -8.03 5.7e-15 ***
## month11 -2.36e+03 2.92e+02 -8.07 4.2e-15 ***
## month12 -1.02e+03 3.74e+02 -2.73 0.0064 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1060 on 565 degrees of freedom
## (54 observations deleted due to missingness)
## Multiple R-squared: 0.244, Adjusted R-squared: 0.225
## F-statistic: 13 on 14 and 565 DF, p-value: <2e-16
The linear model that only incorporates TMAX, PRCP_factor, and year
has statistically significant coefficients for TMAX and year. This
indicates the model predicts that the square root of COVID counts
decreases by 0.3 per degree F increase in TMAX.
However, when we account for month, we lose the significance of the
TMAX variable. This indicates that COVID cases are more affected by
seasonal changes than by temperature itself.
Let’s try to graph this! That did not work!
#covLM_plot <- covWeath_final_scatter +
# geom_smooth(method = lm, se = FALSE, fullrange = TRUE,
#aes(colour = PRCP_factor))
#covLM_plot
#ggplt
# Plotting multiple Regression Lines
#ggplt+geom_smooth(method=lm,se=FALSE,fullrange=TRUE,
# aes(color=Tree))
#covLM_plot
LR to predict Precipitation!
Let’s start by ingesting the Air Quality to add into this prediction
in place of COVID.
#DailyAQ_00_22 <- data.frame(read.csv("data/daily-AQ-NY-00-20.csv"))
#DailyAQ_00_22 <- DailyAQ_00_22[c('Date', 'Daily.Mean.PM2.5.Concentration', #'DAILY_AQI_VALUE')]
#colnames(DailyAQ_00_22) <- c('DATE', 'PM2.5', 'AQI')
#str(DailyAQ_00_22)
#xkablesummary(DailyAQ_00_22)
#xkabledplyhead(DailyAQ_00_22)
Now let’s build a master dataframe that incorporates Date, Year,
Month, TMAX, PRCP, PRCP_Factor, Crime Count, DOW Volume, PM2.5, and
AQI.
# FORMAT AQ Data and subset dates
#AQ_forLogit <- DailyAQ_00_22
#AQ_forLogit$DATE <- as.Date(AQ_forLogit$DATE, format = "%m/%d/%y")
#AQ_forLogit$day <- format(AQ_forLogit$DATE, format="%d")
#AQ_forLogit$month <- format(AQ_forLogit$DATE, format="%m")
#AQ_forLogit$year <- format(AQ_forLogit$DATE, format="%Y")
#AQ_forLogit$year <- as.numeric(AQ_forLogit$year)
#AQ_forLogit2 <- AQ_forLogit %>%
# complete(DATE = seq.Date(min(DATE), max(DATE), by="day"))
#AQ_masterDates <- subset(AQ_forLogit2, year >= 2006 & year < 2022)
# Restrict stocks and weather to 2006-2021 for the master frame.
stock_masterDates <- subset(NYstock_final,Date >= "2006-01-01" &
Date <= "2021-12-31")
crime_masterDates <- NYcrime_count
# NOTE(review): `year` is character, so `year >= 2006` relies on
# lexicographic comparison (valid only for 4-digit years).
weath_masterDates <- subset(NYweath_prcpFact, year >= 2006 & year < 2022)
# NOTE(review): cbind() aligns rows by position only, and crime_masterDates
# is not subset to the same date range here — verify all three frames have
# identical row counts and day ordering, otherwise the columns misalign.
master_log <- cbind(weath_masterDates,
crime_masterDates$NUM_ARREST,
stock_masterDates$Vol.)
# Columns 12:13 are the two just-bound columns (positional index).
colnames(master_log)[12:13] <- c('NUM_ARREST', 'Volume')
head(master_log)
head(master_log)
## DATE day month year TMAX TMIN TAVG PRCP SNOW TOT_PRCP PRCP_factor
## 49981 2006-01-01 01 01 2006 42 32 NA 0.00 0 0.00 0
## 49982 2006-01-02 02 01 2006 48 39 NA 0.63 0 0.63 1
## 49983 2006-01-03 03 01 2006 40 33 NA 1.13 0 1.13 1
## 49984 2006-01-04 04 01 2006 38 29 NA 0.00 0 0.00 0
## 49985 2006-01-05 05 01 2006 50 37 NA 0.05 0 0.05 1
## 49986 2006-01-06 06 01 2006 43 30 NA 0.00 0 0.00 0
## NUM_ARREST Volume
## 49981 551 NA
## 49982 618 NA
## 49983 899 303
## 49984 1229 271
## 49985 1383 251
## 49986 1248 292
# Keep only trading days (Volume present) and add sqrt-normalized volume.
master_logFinal <- master_log[!is.na(master_log$Volume), ]
master_logFinal$Volume_norm <- sqrt(master_logFinal$Volume)
Now let’s build the LR:
# Logistic regression: probability of any precipitation as a function of
# temperature, arrests, trade volume, year and month.
prcp_logit <- glm(
  PRCP_factor ~ TMAX + NUM_ARREST + Volume_norm + year + month,
  family = binomial(link = "logit"),
  data = master_logFinal
)
summary(prcp_logit)
##
## Call:
## glm(formula = PRCP_factor ~ TMAX + NUM_ARREST + Volume_norm +
## year + month, family = binomial(link = "logit"), data = master_logFinal)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.295 -0.957 -0.844 1.350 1.707
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 72.931220 20.475973 3.56 0.00037 ***
## TMAX -0.005733 0.003813 -1.50 0.13272
## NUM_ARREST -0.000950 0.000142 -6.67 2.6e-11 ***
## Volume_norm -0.010261 0.008914 -1.15 0.24972
## year -0.035885 0.010121 -3.55 0.00039 ***
## month02 0.242612 0.168088 1.44 0.14892
## month03 0.149488 0.169288 0.88 0.37722
## month04 0.400139 0.185421 2.16 0.03093 *
## month05 0.326598 0.205157 1.59 0.11140
## month06 0.503649 0.222956 2.26 0.02389 *
## month07 0.344674 0.241819 1.43 0.15406
## month08 0.201185 0.236342 0.85 0.39463
## month09 -0.049412 0.222747 -0.22 0.82445
## month10 0.074118 0.193259 0.38 0.70134
## month11 -0.012329 0.178675 -0.07 0.94499
## month12 0.046996 0.169456 0.28 0.78152
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5256.9 on 4027 degrees of freedom
## Residual deviance: 5185.4 on 4012 degrees of freedom
## AIC: 5217
##
## Number of Fisher Scoring iterations: 4
Let’s assess the LR!
library(ModelMetrics)
# Confusion matrix at the default 0.5 probability cutoff.
# NOTE(review): confirm ModelMetrics' row/column orientation before reading
# these cells — the metric formulas below assume [2,2] = true positives.
prcpLR_cm <- confusionMatrix(actual = prcp_logit$y,
predicted = prcp_logit$fitted.values)
prcpLR_cm
## [,1] [,2]
## [1,] 2541 1399
## [2,] 43 45
# Accuracy, precision, recall and specificity derived from the matrix cells.
prcpLR_acc <- (prcpLR_cm[2,2] + prcpLR_cm[1,1])/(sum(prcpLR_cm))
prcpLR_prec <- (prcpLR_cm[2,2])/(prcpLR_cm[2,2]+prcpLR_cm[1,2])
prcpLR_rec <- (prcpLR_cm[2,2])/(prcpLR_cm[2,2]+prcpLR_cm[2,1])
prcpLR_spec <- (prcpLR_cm[1,1])/(prcpLR_cm[1,1]+prcpLR_cm[1,2])
library(pROC)
# Predicted precipitation probabilities feed an ROC curve / AUC assessment.
master_logFinal$prob <- predict(prcp_logit, type = "response")
prcp_roc <- roc(PRCP_factor ~ prob, data = master_logFinal)
prcp_auc <- auc(prcp_roc)
prcp_auc
## Area under the curve: 0.575
plot(prcp_roc)

library(pscl)
# McFadden (and related) pseudo-R^2 values for the logistic model.
prcp_pr2 <- pR2(prcp_logit)
## fitting null model for pseudo-r2
prcp_pr2
## llh llhNull G2 McFadden r2ML r2CU
## -2.59e+03 -2.63e+03 7.15e+01 1.36e-02 1.76e-02 2.41e-02
This is NOT a good logistic regression!!!
Summary of Key Findings
Conclusion